In [1]:
import os
import re

import plotly.io as pio
import helpsk as hlp

# Make 'notebook' the default plotly renderer for figures shown in this session.
pio.renderers.default='notebook'

def get_project_directory():
    """Return the current working directory with the notebook folders removed.

    The markers ``/develop``, ``/deliver``, ``/archive`` and ``/code/notebooks``
    are stripped from ``os.getcwd()``, in that order, so that paths can be
    built relative to the remaining directory (presumably the project root).

    Only whole path components are removed: a marker must be followed by
    ``/`` or by the end of the path. A plain substring replace would corrupt
    unrelated directories, e.g. ``/home/developer`` -> ``/homeer`` or
    ``/data/archives`` -> ``/datas``.

    Returns:
        str: the working directory without the notebook folder components.
    """
    project_dir = os.getcwd()
    # Same markers and order as the original chained replaces; the lookahead
    # anchors each marker to a component boundary.
    for marker in ('/develop', '/deliver', '/archive', '/code/notebooks'):
        project_dir = re.sub(re.escape(marker) + r'(?=/|\Z)', '', project_dir)
    return project_dir

Results¶

In [2]:
# Location of the saved experiment results, relative to the project directory.
file_name = os.path.join(
    get_project_directory(),
    'artifacts/models/experiments',
    'multi-model-BayesSearchCV-2022-03-10-13-41-33.yaml',
)
In [3]:
# Load the experiment results from the YAML file selected above.
results = hlp.sklearn_eval.MLExperimentResults.from_yaml_file(
    yaml_file_name=file_name,
)

Best Scores/Params¶

In [4]:
# Best score in the loaded results; it agrees with the rank-1 "roc_auc Mean" in the tables below.
results.best_score
Out[4]:
0.774092779234226
In [5]:
# Settings of the best trial: model, C, imputer, scaler, pca and encoder.
results.best_params
Out[5]:
{'model': 'LogisticRegression()',
 'C': 0.13184996310179986,
 'imputer': "SimpleImputer(strategy='median')",
 'scaler': 'StandardScaler()',
 'pca': 'None',
 'encoder': 'OneHotEncoder()'}
In [6]:
# Top-scoring trial for every model type.
df = results.to_formatted_dataframe(return_style=False, include_rank=True)
# Rank trials within each model by mean roc_auc (highest first); ties keep row order.
df = df.assign(
    model_rank=lambda frame: frame.groupby("model")["roc_auc Mean"].rank(
        method="first", ascending=False
    )
)
df[df["model_rank"] == 1]
Out[6]:
rank roc_auc Mean roc_auc 95CI.LO roc_auc 95CI.HI model C max_features max_depth n_estimators min_samples_split ... subsample colsample_bytree colsample_bylevel reg_alpha reg_lambda imputer scaler pca encoder model_rank
5 1 0.774 0.730 0.818 LogisticRegression() 0.131850 NaN NaN NaN NaN ... NaN NaN NaN NaN NaN SimpleImputer(strategy='median') StandardScaler() None OneHotEncoder() 1.0
18 2 0.767 0.720 0.814 RandomForestClassifier() NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN SimpleImputer() None None OneHotEncoder() 1.0
7 4 0.761 0.720 0.803 LinearSVC() 0.280746 NaN NaN NaN NaN ... NaN NaN NaN NaN NaN SimpleImputer(strategy='most_frequent') MinMaxScaler() PCA('mle') OneHotEncoder() 1.0
16 6 0.760 0.701 0.819 ExtraTreesClassifier() NaN 0.68466 30.0 1659.0 25.0 ... NaN NaN NaN NaN NaN SimpleImputer() None PCA('mle') OneHotEncoder() 1.0
28 9 0.753 0.710 0.796 XGBClassifier() NaN NaN 5.0 1246.0 NaN ... 0.95619 0.694741 0.518639 0.242199 1.220693 SimpleImputer(strategy='median') None None OneHotEncoder() 1.0

5 rows × 25 columns

In [7]:
# Ranked table of all trials, returned as a styled frame.
# num_rows=1000 is far above the 30 trials in this run (presumably a row cap; confirm against helpsk).
results.to_formatted_dataframe(return_style=True,
                               include_rank=True,
                               num_rows=1000)
Out[7]:
rank roc_auc Mean roc_auc 95CI.LO roc_auc 95CI.HI model C max_features max_depth n_estimators min_samples_split min_samples_leaf max_samples criterion learning_rate min_child_weight subsample colsample_bytree colsample_bylevel reg_alpha reg_lambda imputer scaler pca encoder
1 0.774 0.730 0.818 LogisticRegression() 0.132 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> SimpleImputer(strategy='median') StandardScaler() None OneHotEncoder()
2 0.767 0.720 0.814 RandomForestClassifier() <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> SimpleImputer() None None OneHotEncoder()
3 0.763 0.725 0.802 LogisticRegression() <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> SimpleImputer() StandardScaler() None OneHotEncoder()
4 0.761 0.720 0.803 LinearSVC() 0.281 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> SimpleImputer(strategy='most_frequent') MinMaxScaler() PCA('mle') OneHotEncoder()
5 0.761 0.697 0.825 LogisticRegression() 0.001 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> SimpleImputer(strategy='median') MinMaxScaler() None OneHotEncoder()
6 0.760 0.701 0.819 ExtraTreesClassifier() <NA> 0.685 30.000 1,659.000 25.000 11.000 0.781 gini <NA> <NA> <NA> <NA> <NA> <NA> <NA> SimpleImputer() None PCA('mle') OneHotEncoder()
7 0.755 0.714 0.796 RandomForestClassifier() <NA> 0.599 70.000 1,858.000 39.000 22.000 0.851 gini <NA> <NA> <NA> <NA> <NA> <NA> <NA> SimpleImputer(strategy='most_frequent') None None OneHotEncoder()
8 0.753 0.716 0.791 RandomForestClassifier() <NA> 0.303 81.000 1,063.000 15.000 27.000 0.502 gini <NA> <NA> <NA> <NA> <NA> <NA> <NA> SimpleImputer(strategy='median') None None OneHotEncoder()
9 0.753 0.710 0.796 XGBClassifier() <NA> <NA> 5.000 1,246.000 <NA> <NA> <NA> <NA> 0.023 15.000 0.956 0.695 0.519 0.242 1.221 SimpleImputer(strategy='median') None None OneHotEncoder()
10 0.752 0.698 0.805 ExtraTreesClassifier() <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> SimpleImputer() None None OneHotEncoder()
11 0.751 0.721 0.781 LinearSVC() <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> SimpleImputer() StandardScaler() None OneHotEncoder()
12 0.751 0.714 0.788 RandomForestClassifier() <NA> 0.323 76.000 1,619.000 31.000 36.000 0.595 gini <NA> <NA> <NA> <NA> <NA> <NA> <NA> SimpleImputer() None None OneHotEncoder()
13 0.749 0.698 0.801 XGBClassifier() <NA> <NA> 1.000 1,974.000 <NA> <NA> <NA> <NA> 0.024 4.000 0.543 0.620 0.876 0.034 1.445 SimpleImputer() None PCA('mle') OneHotEncoder()
14 0.749 0.706 0.792 ExtraTreesClassifier() <NA> 0.408 87.000 1,423.000 25.000 19.000 0.989 entropy <NA> <NA> <NA> <NA> <NA> <NA> <NA> SimpleImputer(strategy='most_frequent') None PCA('mle') CustomOrdinalEncoder()
15 0.747 0.694 0.799 ExtraTreesClassifier() <NA> 0.710 15.000 1,493.000 33.000 27.000 0.914 gini <NA> <NA> <NA> <NA> <NA> <NA> <NA> SimpleImputer(strategy='most_frequent') None PCA('mle') OneHotEncoder()
16 0.746 0.708 0.785 XGBClassifier() <NA> <NA> 3.000 931.000 <NA> <NA> <NA> <NA> 0.029 7.000 0.930 0.817 0.898 0.000 2.733 SimpleImputer(strategy='most_frequent') None None OneHotEncoder()
17 0.746 0.716 0.776 LogisticRegression() 23.327 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> SimpleImputer(strategy='median') StandardScaler() None OneHotEncoder()
18 0.744 0.698 0.790 ExtraTreesClassifier() <NA> 0.328 5.000 1,047.000 23.000 43.000 0.957 entropy <NA> <NA> <NA> <NA> <NA> <NA> <NA> SimpleImputer(strategy='median') None PCA('mle') OneHotEncoder()
19 0.744 0.709 0.779 RandomForestClassifier() <NA> 0.567 38.000 1,060.000 19.000 41.000 0.656 entropy <NA> <NA> <NA> <NA> <NA> <NA> <NA> SimpleImputer() None PCA('mle') OneHotEncoder()
20 0.742 0.686 0.798 XGBClassifier() <NA> <NA> 10.000 1,146.000 <NA> <NA> <NA> <NA> 0.025 14.000 0.771 0.548 0.748 0.093 1.892 SimpleImputer(strategy='median') None PCA('mle') OneHotEncoder()
21 0.738 0.686 0.790 XGBClassifier() <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> SimpleImputer() None None OneHotEncoder()
22 0.736 0.695 0.777 ExtraTreesClassifier() <NA> 0.740 14.000 1,645.000 5.000 43.000 0.741 entropy <NA> <NA> <NA> <NA> <NA> <NA> <NA> SimpleImputer(strategy='most_frequent') None PCA('mle') CustomOrdinalEncoder()
23 0.734 0.695 0.773 RandomForestClassifier() <NA> 0.770 70.000 1,570.000 16.000 39.000 0.910 entropy <NA> <NA> <NA> <NA> <NA> <NA> <NA> SimpleImputer(strategy='most_frequent') None None CustomOrdinalEncoder()
24 0.730 0.702 0.758 LogisticRegression() 0.000 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> SimpleImputer(strategy='median') StandardScaler() None CustomOrdinalEncoder()
25 0.727 0.690 0.765 LinearSVC() 0.361 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> SimpleImputer(strategy='median') MinMaxScaler() PCA('mle') CustomOrdinalEncoder()
26 0.727 0.689 0.765 LinearSVC() 0.746 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> SimpleImputer(strategy='median') MinMaxScaler() None CustomOrdinalEncoder()
27 0.726 0.697 0.755 LogisticRegression() 0.000 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> SimpleImputer(strategy='median') StandardScaler() PCA('mle') CustomOrdinalEncoder()
28 0.714 0.679 0.749 XGBClassifier() <NA> <NA> 4.000 1,181.000 <NA> <NA> <NA> <NA> 0.067 7.000 0.557 0.763 0.592 0.001 2.984 SimpleImputer(strategy='median') None PCA('mle') CustomOrdinalEncoder()
29 0.701 0.669 0.733 LinearSVC() 10.021 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> SimpleImputer() MinMaxScaler() PCA('mle') CustomOrdinalEncoder()
30 0.681 0.646 0.717 LinearSVC() 0.000 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> SimpleImputer() StandardScaler() None CustomOrdinalEncoder()
In [8]:
# Ranked table restricted to the RandomForestClassifier trials.
results.to_formatted_dataframe(query='model == "RandomForestClassifier()"', include_rank=True)
Out[8]:
rank roc_auc Mean roc_auc 95CI.LO roc_auc 95CI.HI max_features max_depth n_estimators min_samples_split min_samples_leaf max_samples criterion imputer pca encoder
1 0.767 0.720 0.814 <NA> <NA> <NA> <NA> <NA> <NA> <NA> SimpleImputer() None OneHotEncoder()
2 0.755 0.714 0.796 0.599 70.000 1,858.000 39.000 22.000 0.851 gini SimpleImputer(strategy='most_frequent') None OneHotEncoder()
3 0.753 0.716 0.791 0.303 81.000 1,063.000 15.000 27.000 0.502 gini SimpleImputer(strategy='median') None OneHotEncoder()
4 0.751 0.714 0.788 0.323 76.000 1,619.000 31.000 36.000 0.595 gini SimpleImputer() None OneHotEncoder()
5 0.744 0.709 0.779 0.567 38.000 1,060.000 19.000 41.000 0.656 entropy SimpleImputer() PCA('mle') OneHotEncoder()
6 0.734 0.695 0.773 0.770 70.000 1,570.000 16.000 39.000 0.910 entropy SimpleImputer(strategy='most_frequent') None CustomOrdinalEncoder()
In [9]:
# Ranked table restricted to the LogisticRegression trials.
results.to_formatted_dataframe(query='model == "LogisticRegression()"', include_rank=True)
Out[9]:
rank roc_auc Mean roc_auc 95CI.LO roc_auc 95CI.HI C imputer scaler pca encoder
1 0.774 0.730 0.818 0.132 SimpleImputer(strategy='median') StandardScaler() None OneHotEncoder()
2 0.763 0.725 0.802 <NA> SimpleImputer() StandardScaler() None OneHotEncoder()
3 0.761 0.697 0.825 0.001 SimpleImputer(strategy='median') MinMaxScaler() None OneHotEncoder()
4 0.746 0.716 0.776 23.327 SimpleImputer(strategy='median') StandardScaler() None OneHotEncoder()
5 0.730 0.702 0.758 0.000 SimpleImputer(strategy='median') StandardScaler() None CustomOrdinalEncoder()
6 0.726 0.697 0.755 0.000 SimpleImputer(strategy='median') StandardScaler() PCA('mle') CustomOrdinalEncoder()

BayesSearchCV Performance Over Time¶

In [10]:
# Performance across trials, faceted by model type.
results.plot_performance_across_trials(facet_by='model').show()
In [11]:
# Performance across trials, restricted to the RandomForestClassifier trials.
results.plot_performance_across_trials(query='model == "RandomForestClassifier()"').show()

Variable Performance Over Time¶

In [12]:
# Parameter values across trials for the RandomForestClassifier trials.
results.plot_parameter_values_across_trials(query='model == "RandomForestClassifier()"').show()

Scatter Matrix¶

In [13]:
# results.plot_scatter_matrix(query='model == "RandomForestClassifier()"',
#                             height=1000, width=1000).show()

Variable Performance - Numeric¶

In [14]:
# Performance against the numeric parameters for the RandomForestClassifier trials.
# NOTE(review): unlike most other plot cells there is no .show() here; the figure is
# displayed only because it is the cell's last expression.
results.plot_performance_numeric_params(query='model == "RandomForestClassifier()"',
                                        height=800)
In [15]:
# Parallel-coordinates plot of the RandomForestClassifier trials.
results.plot_parallel_coordinates(query='model == "RandomForestClassifier()"').show()

Variable Performance - Non-Numeric¶

In [16]:
# Performance against the non-numeric parameters for the RandomForestClassifier trials.
results.plot_performance_non_numeric_params(query='model == "RandomForestClassifier()"').show()

In [17]:
# Score vs. max_features for the RandomForestClassifier trials, with max_depth passed
# as the size dimension and encoder as the color dimension.
# NOTE(review): no .show() here; the figure is displayed as the cell's last expression.
results.plot_score_vs_parameter(
    query='model == "RandomForestClassifier()"',
    parameter='max_features',
    size='max_depth',
    color='encoder',
)

In [18]:
# results.plot_parameter_vs_parameter(
#     query='model == "XGBClassifier()"',
#     parameter_x='colsample_bytree',
#     parameter_y='learning_rate',
#     size='max_depth'
# )
In [19]:
# results.plot_parameter_vs_parameter(
#     query='model == "XGBClassifier()"',
#     parameter_x='colsample_bytree',
#     parameter_y='learning_rate',
#     size='imputer'
# )